/* Copyright (c) 2003 The Nutch Organization. All rights reserved. */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
package net.nutch.tools;
import java.io.*;
import java.util.*;
import java.net.*;
import java.util.logging.*;
import net.nutch.db.*;
import net.nutch.net.*;
import net.nutch.io.*;
import net.nutch.linkdb.*;
import net.nutch.pagedb.*;
import net.nutch.fetcher.*;
import net.nutch.util.*;
/*****************************************************
* This class takes the output of the fetcher and updates the page and link
* DBs accordingly. Eventually, as the database scales, this will be broken into
* several phases, each consuming and emitting batch files, but, for now, we're
* doing it all here.
*
* @author Doug Cutting
*****************************************************/
public class UpdateDatabaseTool {
  /**
   * Factor applied to the source page's score and next-score when creating
   * a Page for a link whose destination host equals the source host.
   * Read from "db.score.link.internal".
   */
  public static final float NEW_INTERNAL_LINK_FACTOR =
    NutchConf.getFloat("db.score.link.internal", 1.0f);

  /**
   * Factor applied to the source page's score and next-score when creating
   * a Page for a link to a different host.
   * Read from "db.score.link.external".
   */
  public static final float NEW_EXTERNAL_LINK_FACTOR =
    NutchConf.getFloat("db.score.link.external", 1.0f);

  /**
   * Only the first MAX_OUTLINKS_PER_PAGE outlinks of a fetched page are
   * examined.  Read from "db.max.outlinks.per.page".
   */
  public static final int MAX_OUTLINKS_PER_PAGE =
    NutchConf.getInt("db.max.outlinks.per.page", 100);

  /**
   * When true, a same-host link is handed to the db together with its
   * target page via addPageIfNotPresent(page, link) instead of being
   * added unconditionally.  Read from "db.ignore.internal.links".
   */
  public static final boolean IGNORE_INTERNAL_LINKS =
    NutchConf.getBoolean("db.ignore.internal.links", true);

  public static final Logger LOG =
    LogFormatter.getLogger("net.nutch.tools.UpdateDatabaseTool");

  /** A page in RETRY status is retried only while its retry count is below this. */
  private static final int MAX_RETRIES = 2;

  // Computed in long arithmetic so the constant can never overflow an int.
  private static final long MILLISECONDS_PER_DAY = 24L * 60 * 60 * 1000;

  // back-compatibility hack for un-dated fetcher output
  // delete after 1 June 2003
  /**
   * An ArrayFile.Reader that fills in a missing fetch date: any
   * FetcherOutput read with a fetch date of 0 gets the lastModified
   * time of the path this reader was opened on.
   */
  public static class FetcherOutputReader extends ArrayFile.Reader {
    /** lastModified time of the path given to the constructor. */
    private long lastModified;

    /**
     * @param file path handed to ArrayFile.Reader; its lastModified time
     *             becomes the default fetch date
     * @throws IOException if the underlying reader cannot be opened
     */
    public FetcherOutputReader(String file) throws IOException {
      super(file);
      this.lastModified = new File(file).lastModified();
    }

    /** Reads the next entry, defaulting its fetch date if unset. */
    public Writable next(Writable value) throws IOException {
      return checkFetchDate((FetcherOutput)super.next(value));
    }

    /** Reads entry n, defaulting its fetch date if unset. */
    public Writable get(long n, Writable value) throws IOException {
      return checkFetchDate((FetcherOutput)super.get(n, value));
    }

    /**
     * Replaces a zero fetch date with lastModified.  A null argument
     * (end of file) is passed through untouched.
     */
    private FetcherOutput checkFetchDate(FetcherOutput fo) {
      if (fo != null && fo.getFetchDate() == 0) {
        // default fetchDate to file's lastModified
        fo.setFetchDate(lastModified);
      }
      return fo;
    }
  }

  private final IWebDBWriter webdb;
  private final int maxCount;
  private final boolean additionsAllowed;

  // Scratch set, cleared and refilled for every changed page, holding the
  // distinct filtered outlink URLs of that page.
  private final Set outlinkSet = new TreeSet();

  /**
   * Take in the WebDBWriter, instantiated elsewhere.
   *
   * @param webdb            writer that receives all page and link updates
   * @param additionsAllowed if false, outlinks are still counted but no
   *                         new links or pages are added to the db
   * @param maxCount         maximum number of entries processed per call
   *                         to updateForSegment; a negative value means
   *                         no limit
   */
  public UpdateDatabaseTool(IWebDBWriter webdb, boolean additionsAllowed, int maxCount) {
    this.webdb = webdb;
    this.additionsAllowed = additionsAllowed;
    this.maxCount = maxCount;
  }

  /**
   * Iterate through items in the FetcherOutput.  For each one,
   * determine whether the pages need to be added to the webdb,
   * or what fields need to be changed.
   *
   * An EOFException while reading is logged and ends processing of this
   * segment; entries handled before it stay applied.
   *
   * @param directory segment directory containing the fetcher output
   * @throws IOException on any other read or db-write failure
   */
  public void updateForSegment(String directory)
    throws IOException {
    String fetchDir = new File(directory, FetcherOutput.DIR_NAME).toString();
    ArrayFile.Reader table = null;
    int count = 0;                                // entries handled in this segment
    try {
      table = new FetcherOutputReader(fetchDir);
      FetcherOutput fo = new FetcherOutput();
      while (table.next(fo) != null) {
        if ((maxCount >= 0) && (count >= maxCount)) {
          break;
        }

        FetchListEntry fle = fo.getFetchListEntry();
        Page page = fle.getPage();
        LOG.fine("Processing " + page.getURL());

        if (!fle.getFetch()) {                    // didn't fetch
          pageContentsUnchanged(fo);              // treat as unchanged
        } else if (fo.getStatus() == FetcherOutput.SUCCESS) { // fetch succeed
          if (fo.getMD5Hash().equals(page.getMD5())) {
            pageContentsUnchanged(fo);            // contents unchanged
          } else {
            pageContentsChanged(fo);              // contents changed
          }
        } else if (fo.getStatus() == FetcherOutput.RETRY &&
                   page.getRetriesSinceFetch() < MAX_RETRIES) {
          pageRetry(fo);                          // retry later
        } else {
          pageGone(fo);                           // give up: page is gone
        }
        count++;
      }
    } catch (EOFException e) {
      LOG.warning("Unexpected EOF in: " + fetchDir +
                  " at entry #" + count + ".  Ignoring.");
    } finally {
      if (table != null) {
        table.close();
      }
    }
  }

  /**
   * There's been no change: update date & retries only
   */
  private void pageContentsUnchanged(FetcherOutput fetcherOutput)
    throws IOException {
    Page oldPage = fetcherOutput.getFetchListEntry().getPage();
    Page newPage = (Page)oldPage.clone();

    LOG.fine("unchanged");

    newPage.setNextFetchTime(nextFetch(fetcherOutput)); // set next fetch
    newPage.setRetriesSinceFetch(0);              // zero retries

    webdb.addPage(newPage);                       // update record in db
  }

  /**
   * We've encountered new content, so update md5, etc.
   * Also insert the new outlinks into the link DB
   */
  private void pageContentsChanged(FetcherOutput fetcherOutput)
    throws IOException {
    Page oldPage = fetcherOutput.getFetchListEntry().getPage();
    Page newPage = (Page)oldPage.clone();

    LOG.fine("new contents");

    newPage.setNextFetchTime(nextFetch(fetcherOutput)); // set next fetch
    newPage.setMD5(fetcherOutput.getMD5Hash());   // update md5
    newPage.setRetriesSinceFetch(0);              // zero retries

    // Go through all the outlinks from this page, and add to
    // the LinkDB.
    //
    // If the replaced page is the last ref to its MD5, then
    // its outlinks must be removed.  The WebDBWriter will
    // handle that, upon page-replacement.
    //
    Outlink[] outlinks = fetcherOutput.getOutlinks();
    String sourceHost = getHost(oldPage.getURL().toString());
    long sourceDomainID = newPage.computeDomainID();

    // Newly discovered pages get the fetch date itself (zero days added)
    // as their next fetch time.
    long linkedPageFetchTime = nextFetch(fetcherOutput, 0);

    outlinkSet.clear();       // Use a sorted set to uniquify the links
    int end = Math.min(outlinks.length, MAX_OUTLINKS_PER_PAGE);
    for (int i = 0; i < end; i++) {
      Outlink link = outlinks[i];

      String url = link.getToUrl();
      url = URLFilterFactory.getFilter().filter(url);
      if (url == null) {      // rejected by the URL filter
        continue;
      }

      outlinkSet.add(url);

      if (additionsAllowed) {
        String destHost = getHost(url);
        // An unparseable destination host is treated as internal.
        boolean internal = destHost == null || destHost.equals(sourceHost);

        try {
          //
          // If it is an in-site link, then we only add a Link if
          // the Page is also added.  So we pass it to addPageIfNotPresent().
          //
          // If it is not an in-site link, then we always add the link.
          // We then conditionally add the Page with addPageIfNotPresent().
          //
          Link newLink = new Link(newPage.getMD5(), sourceDomainID, url, link.getAnchor());

          float linkFactor =
            internal ? NEW_INTERNAL_LINK_FACTOR : NEW_EXTERNAL_LINK_FACTOR;
          float newScore = oldPage.getScore() * linkFactor;
          float newNextScore = oldPage.getNextScore() * linkFactor;

          Page linkedPage = new Page(url, newScore, newNextScore, linkedPageFetchTime);

          if (internal && IGNORE_INTERNAL_LINKS) {
            webdb.addPageIfNotPresent(linkedPage, newLink);
          } else {
            webdb.addLink(newLink);
            webdb.addPageIfNotPresent(linkedPage);
          }
        } catch (MalformedURLException e) {
          LOG.fine("skipping " + url + ":" + e);
        }
      }
    }

    // Calculate the number of different outlinks here.
    // We use the outlinkSet TreeSet so that we count only
    // the unique links leaving the Page.  The WebDB will
    // only store one value for each (fromID,toURL) pair
    //
    // Store the value with the Page, to speed up later
    // Link Analysis computation.
    //
    // NOTE: This value won't necessarily even match what's
    // in the linkdb!  That's OK!  It's more important that
    // this number be a "true count" of the outlinks from
    // the page in question, than the value reflect what's
    // actually in our db.  (There are a number of reasons,
    // mainly space economy, to avoid placing URLs in our db.
    // These reasons slightly pervert the "true out count".)
    //
    newPage.setNumOutlinks(outlinkSet.size());    // Store # outlinks

    webdb.addPage(newPage);                       // update record in db
  }

  /**
   * Keep the page, but never re-fetch it.
   */
  private void pageGone(FetcherOutput fetcherOutput)
    throws IOException {
    Page oldPage = fetcherOutput.getFetchListEntry().getPage();
    Page newPage = (Page)oldPage.clone();

    LOG.fine("retry never");

    newPage.setNextFetchTime(Long.MAX_VALUE);     // never refetch
    webdb.addPage(newPage);                       // update record in db
  }

  /**
   * Update with new retry count and date
   */
  private void pageRetry(FetcherOutput fetcherOutput)
    throws IOException {
    Page oldPage = fetcherOutput.getFetchListEntry().getPage();
    Page newPage = (Page)oldPage.clone();

    LOG.fine("retry later");

    newPage.setNextFetchTime(nextFetch(fetcherOutput, 1)); // wait a day
    newPage.setRetriesSinceFetch
      (oldPage.getRetriesSinceFetch() + 1);       // increment retries

    webdb.addPage(newPage);                       // update record in db
  }

  /**
   * Compute the next fetchtime for the Page, using the page's own
   * fetch interval as the number of days.
   */
  private long nextFetch(FetcherOutput fo) {
    return nextFetch(fo,
                     fo.getFetchListEntry().getPage().getFetchInterval());
  }

  /**
   * Compute the next fetchtime: the recorded fetch date plus the given
   * number of days.
   */
  private long nextFetch(FetcherOutput fetcherOutput, int days) {
    return fetcherOutput.getFetchDate() + (MILLISECONDS_PER_DAY * days);
  }

  /**
   * Parse the hostname from a URL and return it in lower case, or null
   * if the URL cannot be parsed.
   */
  private String getHost(String url) {
    try {
      // Explicit locale: lower-casing must not vary with the JVM's
      // default locale.
      return new URL(url).getHost().toLowerCase(Locale.ENGLISH);
    } catch (MalformedURLException e) {
      return null;
    }
  }

  /**
   * Shut everything down.
   */
  public void close() throws IOException {
    webdb.close();
  }

  /**
   * Create the UpdateDatabaseTool, and pass in a WebDBWriter.
   *
   * Prints the usage line and exits with status -1 when the command
   * line is incomplete or the value given to -max is missing or not an
   * integer.
   */
  public static void main(String args[]) throws Exception {
    File dbDir = null;
    int segDirStart = -1;
    int max = -1;
    boolean additionsAllowed = true;
    boolean badArgs = false;

    String usage = "UpdateDatabaseTool [-max N] [-noAdditions] db_dir seg_dir [ seg_dir ... ]";
    for (int i = 0; i < args.length; i++) {       // parse command line
      if (args[i].equals("-max")) {               // found -max option
        if (i + 1 >= args.length) {               // -max given without a value
          badArgs = true;
          break;
        }
        try {
          max = Integer.parseInt(args[++i]);
        } catch (NumberFormatException e) {       // -max value is not an integer
          badArgs = true;
          break;
        }
      } else if (args[i].equals("-noAdditions")) {
        additionsAllowed = false;
      } else if (dbDir == null) {
        dbDir = new File(args[i]);
      } else {
        segDirStart = i;                          // everything from here on is a segment
        break;
      }
    }

    if (badArgs || segDirStart == -1) {
      System.err.println(usage);
      System.exit(-1);
    }

    LOG.info("Updating " + dbDir);

    IWebDBWriter webdb = new WebDBWriter(dbDir);
    UpdateDatabaseTool tool = new UpdateDatabaseTool(webdb, additionsAllowed, max);

    // NOTE(review): if updateForSegment throws, close() is never reached.
    // Whether it is safe to close the writer after a partial update depends
    // on IWebDBWriter.close() semantics -- confirm before adding a finally.
    for (int i = segDirStart; i < args.length; i++) {
      String segDir = args[i];
      LOG.info("Updating for " + segDir);
      tool.updateForSegment(segDir);
    }

    LOG.info("Finishing update");
    tool.close();
    LOG.info("Update finished");
  }
}